library(tidyverse)
library(urltools)
library(lubridate)
library(stargazer)

# Input CSV of news-domain URL visits.
# NOTE(review): the original note here read "from archived 2_.." -- presumably
# the file is produced by an archived earlier pipeline step; confirm.
input_file <- "../data/news_domain_urls.csv"

# Load the visits, drop the `used_at` column, discard every row that still
# contains a missing value, and relabel "aggregator" sites as "portal".
urls <- read_csv(input_file) %>%
  select(-used_at) %>%
  na.omit() %>%
  mutate(type = ifelse(type == "aggregator", "portal", type))

# Number of distinct respondents (caseid) remaining after cleaning.
n_distinct(pull(urls, caseid))

# Break every URL into its components, then split the parsed domain into
# host / subdomain / domain / suffix parts.
parsed <- tibble(url_parse(pull(urls, url)))
suffix <- tibble(suffix_extract(pull(parsed, domain)))

# Swap the original `domain` and `path` columns for the freshly parsed ones.
# Only the parsed `path` and the extracted `domain` are kept; the helper
# columns `suffix`, `subdomain` and `host` are dropped again.
urls <- urls %>%
  select(-domain, -path) %>%
  bind_cols(select(parsed, path), suffix) %>%
  select(-suffix, -subdomain, -host) %>%
  # A missing path becomes the empty string so later keyword matching
  # yields FALSE rather than NA.
  mutate(path = coalesce(path, "")) %>%
  # Rows for which no domain could be extracted are discarded.
  filter(!is.na(domain))



# Policy-issue vocabulary, turned into a single regex alternation.
# Every space is replaced by "|", so a multi-word phrase contributes each of
# its words as a separate alternative (e.g. "tax cut" yields "tax" and "cut").
# NOTE(review): this word-level splitting looks deliberate because URL paths
# contain no spaces, but it also makes generic words such as "national" or
# "security" match on their own -- confirm this is intended.
issue_keywords <- c("gun", "rifle", "abortion", "tax", "immigrant",
  "immigration", "budget deficit", "defense spending",
  "social security", "environment", "jobs", "crime", "national security",
  "race relations", "healthcare", "government corruption",
  "national economy", "tariff",
  "assault rifle", "concealed", "undocumented", "border patrol",
  "refugee", "visas", "ban muslims",
  "muslim ban", "abortion", "abortions", "clean energy",
  "mandatory minimum", "mandatory minimums", "body camera",
  "body cameras", "three-strike", "gay marriage",
  "defense spending", "domestic spending",
  "raise taxes", "tax cut", "obamacare", "minimum wage") %>%
  str_replace_all(" ", "|") %>%
  str_c(collapse = "|")

# issue = 1 when the URL path matches any issue keyword, 0 otherwise.
# A missing match result (e.g. from an NA path) is counted as 0.
urls <- urls %>%
  mutate(issue = coalesce(as.numeric(str_detect(path, issue_keywords)), 0))

# Political vocabulary: the first column (X1) of the header-less keyword file,
# extended with a handful of extra terms, joined into one regex alternation.
political_keywords <- read_csv("../data/PoliticalKeyWords.csv",
  col_names = FALSE) %>%
  pull(X1) %>%
  c("washington", "gop", "dnc", "rnc", "voting", "bernie", "sanders",
  "assange", "kaepernick", "breitbart", "debate", "candidate",
  "tax", "wikileak", "ballot", "chaffetz", "parties", "PACs",
  "proposition", "flynn", "abedin", "weiner") %>%
  str_c(collapse = "|")

# political = 1 when the URL path matches any political keyword (a missing
# match result counts as 0); portal = 1 when the site type is "portal".
urls <- urls %>%
  mutate(
    political = coalesce(as.numeric(str_detect(path, political_keywords)), 0),
    portal = as.numeric(type == "portal")
  )


# Descriptive checks on the keyword flags. Each expression is evaluated for
# its printed value only; nothing here modifies `urls`.

# Eyeball the two flags side by side.
urls %>% select(issue, political)

# Share of issue URLs that are also flagged political, and the reverse.
urls %>% filter(issue == 1) %>% with(mean(political)) %>% round(2)
urls %>% filter(political == 1) %>% with(mean(issue)) %>% round(2)

# Overall share of URLs flagged political.
urls$political %>% mean() %>% round(2)

# Share of URLs flagged by at least one of the two keyword sets.
# The indicator column has to be extracted before averaging: piping the data
# frame straight into mean(m) would hand the whole data frame to mean() as
# `x`, which returns NA with a warning instead of the share.
urls %>% mutate(m = 1 * (issue + political > 0)) %>%
  pull(m) %>% mean() %>% round(2)

# Share of portal URLs overall, then the issue share within portal and
# non-portal URLs respectively.
urls %>% pull(portal) %>% mean() %>% round(2)
urls %>% filter(portal == 1) %>% pull(issue) %>% mean() %>% round(2)
urls %>% filter(portal == 0) %>% pull(issue) %>% mean() %>% round(2)

# Keep only the URLs flagged as issue-related; the two keyword flags are no
# longer needed afterwards and are dropped.
urls <- select(filter(urls, issue == 1), -issue, -political)

# Number of distinct respondents (caseid) with at least one issue URL.
n_distinct(pull(urls, caseid))

# dup_urls <- urls
# dup_counts <- dup_urls %>% group_by(caseid) %>% summarise(count = n() / 99)
# dup_counts <- dup_counts %>% arrange(desc(count))
# dup_urls %>% filter(caseid == dup_counts$caseid[1]) %>%
#   arrange(domain, path, date) %>% pull(url)

# Collapse repeat visits: within each caseid / domain / date / path
# combination only one row survives. Sorting by descending duration first
# means the surviving row is the one with the longest duration, because
# distinct() keeps the first row of every combination.
urls <- urls %>%
  arrange(desc(duration)) %>%
  distinct(caseid, domain, date, path, .keep_all = TRUE) %>%
  arrange(caseid, date)

# Issue-URL visits per respondent, scaled by 99 and sorted from most to
# least active.
# NOTE(review): 99 is presumably the length of the observation window in
# days -- confirm.
counts <- urls %>%
  count(caseid, name = "count") %>%
  mutate(count = count / 99) %>%
  arrange(desc(count))

# # Checks out
# urls %>% filter(caseid == counts$caseid[1]) %>%
#   arrange(domain, path, date) %>% pull(url)
# urls %>% filter(caseid == counts$caseid[2]) %>%
#   arrange(domain, path, date) %>% pull(url)

# counts %>%
#   ggplot(aes(count)) +
#   geom_histogram() +
#   theme_classic()

# Persist the de-duplicated issue URLs for downstream analysis.
write_csv(urls, "../results/issue_urls.csv")